Let's first import the required libraries:
# --- Imports ---------------------------------------------------------------
import pandas as pd
import pylab as pl                 # unused here; kept — later cells may rely on it
import numpy as np
import scipy.optimize as opt       # unused here; kept — later cells may rely on it
from sklearn import preprocessing  # used later for StandardScaler
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import sidetable                   # provides the df.stb accessor used below
# NOTE: the notebook magic `%matplotlib inline` is not valid in a .py file
# and was removed.
import matplotlib.backends.qt_editor.formsubplottool  # NOTE(review): unused and Qt-dependent — confirm it is needed

matplotlib.rcParams['font.size'] = 18
sns.set_context('talk', font_scale=1.2)

# --- Load data -------------------------------------------------------------
df = pd.read_csv('PovidersUnique.csv')
df.head()
df.dtypes
df.columns

# Frequency table of the target label (top 5 rows).
df.stb.freq(['Fraud'])[:5]

# --- Exploratory plots -----------------------------------------------------
# Average reimbursed amount per Fraud class.
avg_amt = pd.pivot_table(df, index='Fraud', values='AmtReimbursed', aggfunc='mean')
avg_amt.plot(kind='bar', figsize=(10, 5))

df.groupby('Fraud').AvgAmtReimb.mean().plot(kind='bar').set_xlabel('Fraud')
df.groupby('Fraud').AvgAmtDed.mean().plot(kind='bar').set_xlabel('Fraud')

# Box plots of each numeric feature split by the Fraud label.
px.box(df, x='Fraud', y='AmtReimbursed', points='all').show()
for col in ['AmtReimbursed', 'CountOfCID', 'CountOfBene', 'AvgAmtReimb']:
    px.box(df, x='Fraud', y=col).show()

# --- Data pre-processing ---------------------------------------------------
# Encode the target: 'Yes' -> 1, anything else -> 0.
df['Fraud'] = df['Fraud'].apply(lambda x: 1 if x == 'Yes' else 0)
df.describe()
df.columns

# Pairwise feature relationships, colored by the (now numeric) label.
# FIX: seaborn renamed pairplot's `size` parameter to `height` (v0.9+).
sns.pairplot(
    df[['Fraud', 'DeductibleAmt', 'AmtReimbursed', 'CountOfCID', 'AvgAmtReimb',
        'AvgAmtDed', 'CountOfBene', 'AvgReimbPerBene', 'AvgDedPerBene', 'AvgAge']],
    hue='Fraud', diag_kind='kde',
    plot_kws={'alpha': 0.6, 's': 80, 'edgecolor': 'k'},
    height=4,
)

# --- Missing-value handling ------------------------------------------------
# FIX: fillna/dropna return NEW frames; the original code never assigned the
# results, so every call was a no-op. Apply one strategy and assign:
df = df.fillna(0)                    # replace missing values with zero
# Alternatives considered in the original notebook:
# df = df.dropna(axis=0)             # drop rows with any missing value
# df = df.fillna(method='pad')       # propagate previous value forward
# df = df.fillna(method='bfill')     # propagate next value backward
# Correlation matrix of the label and the candidate numeric features,
# used to pick the inputs for the models below.
df[['Fraud', 'DeductibleAmt', 'AmtReimbursed', 'CountOfCID', 'AvgAmtReimb',
    'AvgAmtDed', 'CountOfBene', 'AvgReimbPerBene', 'AvgDedPerBene', 'AvgAge']].corr()

# Next: define X (features) and y (target) for our dataset.
# --- Feature matrix / target vector ----------------------------------------
feature_cols = ['DeductibleAmt', 'AmtReimbursed', 'CountOfCID', 'AvgAmtReimb',
                'AvgAmtDed', 'CountOfBene', 'AvgReimbPerBene', 'AvgDedPerBene']
X = np.asarray(df[feature_cols])
y = np.asarray(df['Fraud'])
print(X.shape, y.shape)

# Standardize features to zero mean / unit variance (the `preprocessing`
# module is imported at the top of the file).
X = preprocessing.StandardScaler().fit(X).transform(X)

# Hold out 10% of the data for testing; fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10,
                                                    random_state=1)
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)
Precision measures, among the samples predicted to have a given class label, how many actually belong to that class. It is defined by: precision = TP / (TP + FP)
Recall is true positive rate. It is defined as: Recall = TP / (TP + FN)
So, we can calculate precision and recall of each class.
F1 score: Now we are in the position to calculate the F1 scores for each label based on the precision and recall of that label.
The F1 score is the harmonic mean of precision and recall; it reaches its best value at 1 (perfect precision and recall) and its worst at 0. It is a good way to show that a classifier achieves good values for both recall and precision.
And finally, we can report the macro-averaged F1 score for this classifier — the mean of the per-label F1 scores — which is 0.72 in our case.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import recall_score, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm as svm_module
from sklearn import metrics
import xgboost as xgb

# FIX: sklearn orders confusion-matrix rows/columns by sorted class value,
# so index 0 is class 0 (= not fraud). The original listed 'Fraud' first,
# which swapped the labels on every plot.
CLASS_LABELS = ['not_Fraud', 'Fraud']

# Heavier weight on the rare positive (fraud) class. (The original defined
# two identical dicts, `class_weights` and `class_weightss`; merged.)
class_weights = {0: 0.08, 1: 0.92}


def show_confusion_matrices(model):
    """Plot raw and row-normalized confusion matrices for `model` on the test set.

    NOTE: `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and
    removed in 1.2; on newer versions use
    `ConfusionMatrixDisplay.from_estimator` instead.
    """
    for title, normalize in [("Confusion matrix", None),
                             ("Normalized confusion matrix", 'true')]:
        disp = plot_confusion_matrix(model, X_test, y_test,
                                     display_labels=CLASS_LABELS,
                                     cmap=plt.cm.Blues,
                                     normalize=normalize)
        disp.ax_.set_title(title)
        print(title)
        print(disp.confusion_matrix)
    plt.show()


# --- Logistic regression ----------------------------------------------------
# Deliberately strong regularization (low C) to see its effect.
LR = LogisticRegression(C=0.1, solver='liblinear',
                        class_weight=class_weights).fit(X_train, y_train)
np.set_printoptions(precision=2)
show_confusion_matrices(LR)

yhat_lr = LR.predict(X_test)
print(classification_report(y_test, yhat_lr))

# 5-fold cross-validation with macro-averaged precision/recall.
scoring = {'prec_macro': 'precision_macro',
           'rec_macro': make_scorer(recall_score, average='macro')}
scores = cross_validate(LR, X, y, scoring=scoring, cv=5,
                        return_train_score=True)
sorted(scores.keys())
scores['train_rec_macro']
LR.score(X_test, y_test)

# --- Linear SVM --------------------------------------------------------------
# FIX: the original bound the fitted model to the name `svm`, shadowing the
# sklearn `svm` module it had just imported; renamed to `svm_model`.
svm_model = svm_module.SVC(kernel='linear', C=0.1,
                           class_weight=class_weights).fit(X_train, y_train)
show_confusion_matrices(svm_model)


def f_importances(coef, names):
    """Bar chart of absolute linear-SVM coefficients, sorted ascending."""
    imp = abs(coef)
    imp, names = zip(*sorted(zip(imp, names)))
    plt.barh(range(len(names)), imp, align='center')
    plt.yticks(range(len(names)), names)
    plt.show()


# assumes df.columns[2:-1] lines up with the 8 training features — TODO confirm
features_names = df.columns[2:-1]
f_importances(*svm_model.coef_, features_names)

yhat_svm = svm_model.predict(X_test)
print(classification_report(y_test, yhat_svm))

# --- Random forest -----------------------------------------------------------
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1,
                            class_weight=class_weights).fit(X_train, y_train)
np.set_printoptions(precision=2)

yhat_rf = rf.predict(X_test)
print(classification_report(y_test, yhat_rf))

conf_matrix = confusion_matrix(y_test, yhat_rf)
plt.figure(figsize=(6, 6))
sns.heatmap(conf_matrix, xticklabels=CLASS_LABELS, yticklabels=CLASS_LABELS,
            annot=True, fmt="d")
plt.title("Confusion matrix")
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()

# --- XGBoost: pick n_estimators via built-in CV ------------------------------
D_train = xgb.DMatrix(X_train, label=y_train)
D_test = xgb.DMatrix(X_test, label=y_test)

cv_folds = 5
early_stopping_rounds = 50
alg = xgb.XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                        min_child_weight=3, gamma=0.2, subsample=0.6,
                        colsample_bytree=1.0, objective='binary:logistic',
                        nthread=4, scale_pos_weight=1, seed=27)
print("Start Feeding Data")
xgb_param = alg.get_xgb_params()
xgtrain = xgb.DMatrix(X_train, label=y_train)
cvresult = xgb.cv(xgb_param, xgtrain,
                  num_boost_round=alg.get_params()['n_estimators'],
                  nfold=cv_folds,
                  early_stopping_rounds=early_stopping_rounds)
# Use the CV-selected number of boosting rounds.
alg.set_params(n_estimators=cvresult.shape[0])
print('Start Training')
alg.fit(X_train, y_train, eval_metric='auc')

# --- Final XGBoost model -----------------------------------------------------
# scale_pos_weight=6 upweights the rare fraud class.
clf = xgb.XGBClassifier(learning_rate=0.01, n_estimators=86, max_depth=5,
                        min_child_weight=3, gamma=0.2, subsample=0.6,
                        colsample_bytree=1.0, objective='binary:logistic',
                        nthread=4, scale_pos_weight=6, seed=27)
xgbb = clf.fit(X_train, y_train)
np.set_printoptions(precision=2)

# FIX: predictions must exist before the heatmap below uses them — the
# original referenced `yhat_xgb` before it was assigned (NameError in
# script order).
yhat_xgb = xgbb.predict(X_test)

# FIX: the original loop never passed `normalize`, so the "normalized" plot
# was identical to the raw one; reuse the shared helper, which does.
show_confusion_matrices(xgbb)

conf_matrix = confusion_matrix(y_test, yhat_xgb)
plt.figure(figsize=(6, 6))
sns.heatmap(conf_matrix, xticklabels=CLASS_LABELS, yticklabels=CLASS_LABELS,
            annot=True, fmt="d")
plt.title("Confusion matrix")
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()

print(classification_report(y_test, yhat_xgb))

# --- Feature importance ------------------------------------------------------
feature_important = xgbb.get_booster().get_score(importance_type='weight')
keys = list(feature_important.keys())
values = list(feature_important.values())
data = pd.DataFrame(data=values, index=keys,
                    columns=["score"]).sort_values(by="score", ascending=False)
data.plot(kind='barh')

# Feature indices sorted by descending importance.
sorted_idx = np.argsort(xgbb.feature_importances_)[::-1]
sorted_idx

# NOTE(review): the model was trained on 8 features (no 'AvgAge'); listing it
# here is harmless for indexing but it can never appear in the importances.
feature = ['DeductibleAmt', 'AmtReimbursed', 'CountOfCID', 'AvgAmtReimb',
           'AvgAmtDed', 'CountOfBene', 'AvgReimbPerBene', 'AvgDedPerBene',
           'AvgAge']
for index in sorted_idx:
    print(feature[index], xgbb.feature_importances_[index])

xgb.plot_importance(xgbb, max_num_features=15)
plt.show()

# --- Accuracy comparison across all models -----------------------------------
print("LR Accuracy : ", np.round(metrics.accuracy_score(y_test, yhat_lr), 2))
print("SVM Accuracy : ", np.round(metrics.accuracy_score(y_test, yhat_svm), 2))
print("Random Forest Accuracy : ", np.round(metrics.accuracy_score(y_test, yhat_rf), 2))
print("XGB Accuracy : ", np.round(metrics.accuracy_score(y_test, yhat_xgb), 2))